{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   index       gpa         gre  actual_label  fold\n",
      "0    364  3.348772  605.968197             0     1\n",
      "1      9  3.910495  463.470183             0     1\n",
      "2    618  3.037396  665.090554             1     1\n",
      "3     87  3.059940  477.230992             0     1\n",
      "4     12  3.018922  567.714830             0     1\n",
      "     index       gpa         gre  actual_label  fold\n",
      "639    489  3.179946  617.568367             1     5\n",
      "640    237  3.237353  643.485972             0     5\n",
      "641    128  3.467073  604.016912             0     5\n",
      "642    600  3.621650  689.847374             1     5\n",
      "643    486  3.594541  661.634438             1     5\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "admissions = pd.read_csv(\"admissions.csv\")\n",
    "admissions[\"actual_label\"] = admissions[\"admit\"]\n",
    "admissions = admissions.drop(\"admit\", axis=1)\n",
    "\n",
    "shuffled_index = np.random.permutation(admissions.index)\n",
    "shuffled_admissions = admissions.loc[shuffled_index]\n",
    "admissions = shuffled_admissions.reset_index()\n",
    "admissions.ix[0:128, \"fold\"] = 1\n",
    "admissions.ix[129:257, \"fold\"] = 2\n",
    "admissions.ix[258:386, \"fold\"] = 3\n",
    "admissions.ix[387:514, \"fold\"] = 4\n",
    "admissions.ix[515:644, \"fold\"] = 5\n",
    "# Ensure the column is set to integer type.\n",
    "admissions[\"fold\"] = admissions[\"fold\"].astype('int')\n",
    "\n",
    "print(admissions.head())\n",
    "print(admissions.tail())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.713178294574\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python27\\lib\\site-packages\\ipykernel\\__main__.py:10: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "# Training\n",
    "model = LogisticRegression()\n",
    "train_iteration_one = admissions[admissions[\"fold\"] != 1]\n",
    "test_iteration_one = admissions[admissions[\"fold\"] == 1]\n",
    "model.fit(train_iteration_one[[\"gpa\"]], train_iteration_one[\"actual_label\"])\n",
    "\n",
    "# Predicting\n",
    "labels = model.predict(test_iteration_one[[\"gpa\"]])\n",
    "test_iteration_one[\"predicted_label\"] = labels\n",
    "\n",
    "matches = test_iteration_one[\"predicted_label\"] == test_iteration_one[\"actual_label\"]\n",
    "correct_predictions = test_iteration_one[matches]\n",
    "iteration_one_accuracy = len(correct_predictions) / float(len(test_iteration_one))\n",
    "print(iteration_one_accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.7131782945736435, 0.5813953488372093, 0.6511627906976745, 0.609375, 0.6434108527131783]\n",
      "0.639704457364\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python27\\lib\\site-packages\\ipykernel\\__main__.py:11: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "fold_ids = [1,2,3,4,5]\n",
    "def train_and_test(df, folds):\n",
    "    fold_accuracies = []\n",
    "    for fold in folds:\n",
    "        model = LogisticRegression()\n",
    "        train = admissions[admissions[\"fold\"] != fold]\n",
    "        test = admissions[admissions[\"fold\"] == fold]\n",
    "        model.fit(train[[\"gpa\"]], train[\"actual_label\"])\n",
    "        labels = model.predict(test[[\"gpa\"]])\n",
    "        test[\"predicted_label\"] = labels\n",
    "\n",
    "        matches = test[\"predicted_label\"] == test[\"actual_label\"]\n",
    "        correct_predictions = test[matches]\n",
    "        fold_accuracies.append(len(correct_predictions) / float(len(test)))\n",
    "    return(fold_accuracies)\n",
    "\n",
    "accuracies = train_and_test(admissions, fold_ids)\n",
    "print(accuracies)\n",
    "average_accuracy = np.mean(accuracies)\n",
    "print(average_accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.70790123  0.69550265  0.65987934  0.73363017  0.57864583]\n",
      "0.675111844524\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cross_validation import KFold\n",
    "from sklearn.cross_validation import cross_val_score\n",
    "\n",
    "admissions = pd.read_csv(\"admissions.csv\")\n",
    "admissions[\"actual_label\"] = admissions[\"admit\"]\n",
    "admissions = admissions.drop(\"admit\", axis=1)\n",
    "\n",
    "kf = KFold(len(admissions), 5, shuffle=True, random_state=8)\n",
    "lr = LogisticRegression()\n",
    "#roc_auc \n",
    "accuracies = cross_val_score(lr,admissions[[\"gpa\"]], admissions[\"actual_label\"], scoring=\"roc_auc\", cv=kf)\n",
    "average_accuracy = sum(accuracies) / len(accuracies)\n",
    "\n",
    "print(accuracies)\n",
    "print(average_accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
